Introduction

This IPython notebook illustrates how to debug blocker output.

First, we need to import the py_entitymatching package and other libraries as follows:



In [1]:

    
# Import py_entitymatching package
import py_entitymatching as em
import os
import pandas as pd

Then, read the (sample) input tables for blocking purposes.



In [2]:

    
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join inserts the platform separator for us, so no manual os.sep
# concatenation is needed.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')

# Build the paths of the two input tables inside that directory
path_A = os.path.join(datasets_dir, 'person_table_A.csv')
path_B = os.path.join(datasets_dir, 'person_table_B.csv')



In [3]:

    
# Load both person tables (A first, then B), registering 'ID' as the key
# attribute of each one
A, B = (
    em.read_csv_metadata(csv_path, key='ID')
    for csv_path in (path_A, path_B)
)

Debugging Blocker Output

First, block the input tables using a rule-based blocker.



In [4]:

    
# First get the features that can be used in blocking rules for tables A and B.
# NOTE(review): validate_inferred_attr_types=False presumably skips the
# interactive confirmation of the inferred attribute types — confirm in the docs.
feature_table = em.get_features_for_blocking(A, B, validate_inferred_attr_types=False)



In [5]:

    
# Create rule-based blocker
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.8
# (the feature name must match an entry in feature_table; add_rule returns
# the name assigned to the rule)
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.8'], feature_table)









    Out[5]:





'_rule_0'



In [6]:

    
# Apply the blocker to tables A and B; keep the 'name' attribute from each
# side in the resulting candidate set E
E = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])









    



0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:00



In [7]:

    
# Display the candidate set produced by the blocker
E









    Out[7]:







  
    
      
      _id
      ltable_ID
      rtable_ID
      ltable_name
      rtable_name
    
  
  
    
      0
      0
      a5
      b5
      Alphonse Kemper
      Alfons Kemper



In [8]:

    
# Debug the blocker output: pass the candidate set E together with both
# input tables and ask for 5 tuple pairs back.
# NOTE(review): the returned pairs appear to be likely matches that are
# missing from E, ranked by similarity — confirm against the debug_blocker docs.
dbg = em.debug_blocker(E, A, B, output_size=5)



In [9]:

    
# Display the tuple pairs reported by the blocker debugger
dbg









    Out[9]:







  
    
      
      _id
      similarity
      ltable_ID
      rtable_ID
      ltable_name
      ltable_address
      rtable_name
      rtable_address
    
  
  
    
      0
      0
      0.750000
      a2
      b3
      Michael Franklin
      1652 Stockton St, San Francisco
      Mike Franklin
      1652 Stockton St, San Francisco
    
    
      1
      1
      0.750000
      a3
      b2
      William Bridge
      3131 Webster St, San Francisco
      Bill Bridge
      3131 Webster St, San Francisco
    
    
      2
      2
      0.272727
      a4
      b2
      Binto George
      423 Powell St, San Francisco
      Bill Bridge
      3131 Webster St, San Francisco
    
    
      3
      3
      0.272727
      a4
      b3
      Binto George
      423 Powell St, San Francisco
      Mike Franklin
      1652 Stockton St, San Francisco
    
    
      4
      4
      0.272727
      a5
      b6
      Alphonse Kemper
      1702 Post Street, San Francisco
      Michael Brodie
      133 Clement Street, San Francisco



In [10]:

    
# Create rule-based blocker --- NOTE: we are creating a new blocker, so the
# earlier 0.8 rule is not part of this instance
rb = em.RuleBasedBlocker()
# Add rule : block tuples if name_name_lev_sim(ltuple, rtuple) < 0.4
# (a lower threshold than before, so fewer tuple pairs are blocked)
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], feature_table)









    Out[10]:





'_rule_0'



In [11]:

    
# Re-run blocking with the new blocker; E is rebound to the new candidate set
E = rb.block_tables(A, B, l_output_attrs=['name'], r_output_attrs=['name'])









    



0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:00



In [12]:

    
# Display the candidate set produced by the new blocker
E









    Out[12]:







  
    
      
      _id
      ltable_ID
      rtable_ID
      ltable_name
      rtable_name
    
  
  
    
      0
      0
      a2
      b3
      Michael Franklin
      Mike Franklin
    
    
      1
      1
      a2
      b6
      Michael Franklin
      Michael Brodie
    
    
      2
      2
      a3
      b2
      William Bridge
      Bill Bridge
    
    
      3
      3
      a3
      b6
      William Bridge
      Michael Brodie
    
    
      4
      4
      a4
      b2
      Binto George
      Bill Bridge
    
    
      5
      5
      a5
      b5
      Alphonse Kemper
      Alfons Kemper



In [13]:

    
# Debug the new candidate set E against both input tables, again asking for
# 5 tuple pairs back
dbg = em.debug_blocker(E, A, B, output_size=5)



In [14]:

    
# Display the tuple pairs reported by the blocker debugger for the new candidate set
dbg









    Out[14]:







  
    
      
      _id
      similarity
      ltable_ID
      rtable_ID
      ltable_name
      ltable_address
      rtable_name
      rtable_address
    
  
  
    
      0
      0
      0.272727
      a3
      b1
      William Bridge
      3131 Webster St, San Francisco
      Mark Levene
      108 Clement St, San Francisco
    
    
      1
      1
      0.272727
      a3
      b3
      William Bridge
      3131 Webster St, San Francisco
      Mike Franklin
      1652 Stockton St, San Francisco
    
    
      2
      2
      0.272727
      a5
      b6
      Alphonse Kemper
      1702 Post Street, San Francisco
      Michael Brodie
      133 Clement Street, San Francisco
    
    
      3
      3
      0.272727
      a4
      b1
      Binto George
      423 Powell St, San Francisco
      Mark Levene
      108 Clement St, San Francisco
    
    
      4
      4
      0.272727
      a4
      b3
      Binto George
      423 Powell St, San Francisco
      Mike Franklin
      1652 Stockton St, San Francisco



In [ ]:

	_id	similarity	ltable_ID	rtable_ID	ltable_name	ltable_address	rtable_name	rtable_address
0	0	0.750000	a2	b3	Michael Franklin	1652 Stockton St, San Francisco	Mike Franklin	1652 Stockton St, San Francisco
1	1	0.750000	a3	b2	William Bridge	3131 Webster St, San Francisco	Bill Bridge	3131 Webster St, San Francisco
2	2	0.272727	a4	b2	Binto George	423 Powell St, San Francisco	Bill Bridge	3131 Webster St, San Francisco
3	3	0.272727	a4	b3	Binto George	423 Powell St, San Francisco	Mike Franklin	1652 Stockton St, San Francisco
4	4	0.272727	a5	b6	Alphonse Kemper	1702 Post Street, San Francisco	Michael Brodie	133 Clement Street, San Francisco